About the data used

The Wine Quality Data Set is a collection of data on wines with 13 attributes and 6497 records when merged. The original data is subdivided into two separate data sets, one for white wine and the other for red wines. These attributes include: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, and quality (score between 0 and 10). a. The source for the dataset can be found here: i. Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal @2009 ii. Weblink: http://archive.ics.uci.edu/ml/datasets/Wine+Quality

Prepare the data

Standardize the data, create dummy variables as needed, and segment for testing and training purposes.

#clear environment
#NOTE(review): rm(list = ls()) wipes the caller's entire workspace — acceptable
#in a self-contained knitted report, but avoid it in reusable scripts
rm(list = ls())
#load data via csv file (expects winequality_white_red.csv in the working directory)
my_data<-read.csv("winequality_white_red.csv")
#assign all the correct numeric and factor variables here, set factor levels as needed
str(my_data)
## 'data.frame':    6497 obs. of  13 variables:
##  $ Wine_type           : Factor w/ 2 levels "red","White": 2 2 2 2 2 2 2 2 2 2 ...
##  $ fixed_acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile_acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric_acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual_sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free_sulfur_dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total_sulfur_dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
#exclude any variables you want by making them characters; conversely, include by
#making them numeric, integer, or factor, or recode levels as text so the names
#appear as column headers later
my_data$Wine_type<-as.character(my_data$Wine_type)
my_data$quality<-as.factor(my_data$quality)
#get the column names
variable_names<- colnames(my_data)
#preallocate containers sized to the number of columns
#(the original c(ncol(my_data)) / list(ncol(my_data)) created length-1 objects
#holding the column COUNT, not empty containers of that length)
numeric_columns<-character(ncol(my_data))
factor_list<-vector("list", ncol(my_data))
dat<-vector("list", ncol(my_data))

#scale (standardize) every numeric column; each scaled column is stored in dat
#and also assign()ed into the global environment as "<name>_scaled" so it can be
#collected later via mget() on the global environment
for(i in seq_len(ncol(my_data))){
  if(is.numeric(my_data[,i])){
    numeric_columns[i]<-paste0(colnames(my_data[i]), "_scaled")
    dat[[i]]<-as.data.frame(scale(my_data[,i]))
    names(dat[[i]])<-numeric_columns[i]
    assign(numeric_columns[i], dat[[i]])
  }
}
#create all the dummy variables
library(psych)
#creates a list of all the dummy coded variables; non-factor columns get the
#sentinel string "empty" so downstream code can skip them
for(i in seq_len(ncol(my_data))){
  if(is.factor(my_data[,i])){
    factor_list[[i]]<- as.data.frame(dummy.code(my_data[,i]))
  }else{
    factor_list[[i]]<-"empty"
  }
}
#container for the dummy-coded columns: start with a placeholder column so the
#cbind() below has something to bind to, then drop it (leaving a 0-column frame)
my_data_cat<-data.frame(matrix(nrow =nrow(my_data), ncol =1))
my_data_cat<- my_data_cat[,-ncol(my_data_cat)]#removes NA column originally created
for(i in 1:length(factor_list)){
  #bind the dummy columns (minus the first level, which serves as the
  #reference category) onto the accumulating data frame
  if(is.data.frame(factor_list[[i]])&& !factor_list[[i]][1]=="empty"){
factor_list[[i]]<-factor_list[[i]][-1]
  my_data_cat<-do.call(cbind,list(factor_list[[i]], my_data_cat)) 
  }
  #NOTE(review): both sides of this `|` are the identical test, so the second
  #operand is redundant — possibly one side was meant to check a different
  #level name; confirm intent
  if((c("1") %in% (colnames(factor_list[[i]])[1]))|(c("1") %in% (colnames(factor_list[[i]])[1]))) {
  colnames(factor_list[[i]])<-variable_names[i]
}
  #NOTE(review): is.data.frame() applied to the RESULT of a comparison is
  #FALSE for the "empty" character sentinel, so this branch looks unreachable
  #as written — confirm the intended condition (perhaps
  #is.data.frame(...) && ... was meant)
  if(is.data.frame(factor_list[[i]][1]=="empty")){
    factor_list[[i]]<-as.data.frame(factor_list[[i]])
    names(factor_list[[i]])<-names(my_data[,i])
  }
}
#recombine the data into one dataset
#NOTE(review): this collects EVERY data.frame currently in the global
#environment (the *_scaled frames assign()ed above, plus my_data and
#my_data_cat), so the resulting column order depends on the environment's
#contents at this point — fragile if run interactively with other objects around
numeric_data<- sapply(.GlobalEnv, is.data.frame)
my_data_all<-do.call(cbind, mget(names(numeric_data)[numeric_data]))
#refine for analysis
#remove any dataset with "my_data." as column header
remove_old_columns<- colnames(my_data_all)  # NOTE(review): assigned but never used
my_data_all_2<-my_data_all[, !grepl(glob2rx("my_data.*"), names(my_data_all))]
write.csv(my_data_all, "my_data_all.csv")
write.csv(my_data_all_2,"my_data_all_2.csv")

#add back in the Y factor (quality and Wine_type were excluded from the
#scaled/dummy predictor set above)
my_data_all_3<-cbind(my_data_all_2,my_data$quality, my_data$Wine_type)
write.csv(my_data_all_3, "my_data_all_3.csv")

set.seed(1)
#shuffle rows so the sequential folds below are effectively random
my_data_all_3<-my_data_all_3[sample(nrow(my_data_all_3)),]

#Create 10 equally sized folds; name the fold column explicitly instead of
#letting cut()'s deparsed call text become the column header
folds <- data.frame(fold = cut(seq_len(nrow(my_data_all_3)), breaks = 10, labels = FALSE))
my_data_all_3<-cbind(folds,my_data_all_3)

#save segmented portions to drive: set1..set10 in the global environment and
#set<i>.csv on disk; get() replaces the original eval(parse(text = ...)) idiom
for (i in 1:10){
  assign(paste0("set", i), my_data_all_3[my_data_all_3$fold == i, ])
  write.csv(get(paste0("set", i)), paste0("set", i, ".csv"))
}

Logistic Regression Classification Models

General overview of data pre processing. There are significantly more records of white wines than red.

library(ggplot2)
library(plotly)
#bar chart of record counts by wine type; use bare column names inside aes()
#rather than my_data$... — the data argument already supplies the frame, and
#the $ form is discouraged (breaks faceting, produces ugly legend titles)
wine_data_type_summary_plot<- ggplot(my_data, aes(Wine_type, color = Wine_type, fill = Wine_type)) + geom_bar()
ggplotly(wine_data_type_summary_plot)

### Determine the best predictors for wine type

Use cross-validation to find a good lambda level, using misclassification error

Compile training data

set.seed(11)

#conduct all testing on set2-set10; set1 is held out as the test fold
my_train_data <- do.call(rbind, unname(mget(paste0("set", 2:10))))
#report the share of records used for training (~90% with 10 folds)
paste("proportion of training data:", "~",
      round(100 * nrow(my_train_data) / nrow(my_data)), "%")
## [1] "proportion of training data: ~ 90 %"

Use crossvalidation to find good lambda level, using misclassification error

library(ggfortify)
library(ggplot2)
library(plotly)
library(glmnet)

#lasso (alpha = 1) logistic regression with 10-fold CV scored by
#misclassification error ("class"); columns 2:18 are the predictors (the fold
#column 1 is excluded) and the last column is the Wine_type label
cross_validated_fit <- cv.glmnet(x = as.matrix(my_train_data[,c(2:18)]), y = as.matrix(my_train_data[,ncol(my_train_data)]), family = "binomial", type.measure = "class", nfolds = 10, nlambda = 1000, alpha = 1)

Plot result of cross validation, using misclassification error

#CV curve: misclassification error vs log(lambda), with error bars
ggplotly(autoplot(cross_validated_fit, color = "orange"))

Show lasso plot for variable inspection and selection for models using misclassification error

#coefficient paths vs lambda, for variable inspection and selection
ggplotly(autoplot(cross_validated_fit$glmnet.fit, "lambda", label = TRUE))

Logistic Models for Consideration

Based on the cross-validated plot of misclassification error against log(lambda), using a lambda 1 sd from the min should be ok. Though it should be noted that one could use a log(lambda) of between -4 and -2 without a significant increase in misclassification error (still <.05) but with the added benefit of a more parsimonious model (needing ~6 variables vs 11 with the recommended lambda).

Consequently, we will consider two models, one with the lowest misclassification levels and the other with highest level of parsimony.

Logistic Regression model with lowest misclassification levels

selected lambda level:

#lambda one standard error above the minimum-CV-error lambda
cross_validated_fit$lambda.1se
## [1] 0.002579141

Gather coef for selected lambda level

#refit the lasso at the selected lambda (lambda.1se) on the full training set
lasso_model1<-glmnet(x = as.matrix(my_train_data[,c(2:18)]), y = as.matrix(my_train_data[,ncol(my_train_data)]), family = "binomial", lambda = cross_validated_fit$lambda.1se, alpha = 1)

#sparse coefficient vector; '.' entries were shrunk to exactly zero
lasso_model1$beta
## 17 x 1 sparse Matrix of class "dgCMatrix"
##                                      s0
## my_data_cat.4                1.56360580
## my_data_cat.5                .         
## my_data_cat.6                .         
## my_data_cat.7                .         
## my_data_cat.8                .         
## my_data_cat.9                .         
## pH_scaled                   -0.32870198
## density_scaled              -2.31318165
## alcohol_scaled              -0.44769990
## sulphates_scaled            -0.66392192
## residual_sugar_scaled        1.75575020
## citric_acid_scaled           .         
## free_sulfur_dioxide_scaled   .         
## total_sulfur_dioxide_scaled  2.21239836
## volatile_acidity_scaled     -1.28172516
## fixed_acidity_scaled        -0.08478189
## chlorides_scaled            -0.54835257

Run model and gather prediction data on training cases

#run predictions against the training cases (returns the linear predictor)
#NOTE(review): predict.glmnet() does not honour type = "response" the way
#predict() on other binomial fits does — hence the manual inverse logit below
lasso_model1_train_prediction<-predict.glmnet(lasso_model1, newx = as.matrix(my_train_data[2:18]), s = cross_validated_fit$lambda.1se, type = "response" )
#convert results to probabilities via inverse logit; plogis() is the
#numerically stable form of exp(x)/(1 + exp(x)) (no Inf/Inf -> NaN for large x)
lasso_model1_train_prediction_probs<- plogis(lasso_model1_train_prediction)

my_train_data_updated<-as.data.frame(cbind(my_train_data, lasso_model1_train_prediction_probs))
#column `1` holds the predicted P(White); threshold at 0.5
my_train_data_updated$predictions_logistic_model1<-ifelse(my_train_data_updated$`1`  >=.5,"White","red")

red wine confusion table–results on train case

#training confusion matrix: rows = actual Wine_type, columns = predicted class
my_confusion_table_train_class_red<-table(my_train_data_updated$`my_data$Wine_type`, my_train_data_updated$predictions_logistic_model1)
my_confusion_table_train_class_red
##        
##          red White
##   red   1406    22
##   White   13  4406
#columns= predictions
#rows = reference

Run model and gather prediction data on test case

red wine confusion table–results on test case

#run predictions against the held-out test fold (set1); returns linear predictor
lasso_model1_test_prediction<-predict.glmnet(lasso_model1, newx = as.matrix(set1[,2:18]), s = cross_validated_fit$lambda.1se, type = "response" )
#convert results to probabilities via inverse logit; plogis() is the
#numerically stable form of exp(x)/(1 + exp(x)) (no Inf/Inf -> NaN for large x)
lasso_model1_test_prediction_probs<- plogis(lasso_model1_test_prediction)

set1_updated<-as.data.frame(cbind(set1, lasso_model1_test_prediction_probs))
#column `1` holds the predicted P(White); threshold at 0.5
set1_updated$predictions_logistic_model1<-ifelse(set1_updated$`1`>=.5,"White","red")
#test confusion matrix: rows = actual Wine_type, columns = predicted class
my_confusion_table_test_class_red<-table(set1_updated$`my_data$Wine_type`, set1_updated$predictions_logistic_model1)
my_confusion_table_test_class_red
##        
##         red White
##   red   170     1
##   White   2   477
#columns= predictions
#rows = reference

Precision, recall, F1score, for model– class red training

#confusion-table cells with "red" as the positive class
TP_train_red<-my_confusion_table_train_class_red[1,1]
FN_train_red<-my_confusion_table_train_class_red[1,2]
FP_train_red<-my_confusion_table_train_class_red[2,1]
TN_train_red<-my_confusion_table_train_class_red[2,2] 

#precision = TP/(TP+FP); recall = TP/(TP+FN); F1 = harmonic mean of the two
my_confusion_table_precision_train_class_red <- TP_train_red/(TP_train_red + FP_train_red)  
my_confusion_table_recall_train_class_red <- TP_train_red / (TP_train_red + FN_train_red)
my_confusion_table_F1score_train_class_red <-2*(my_confusion_table_precision_train_class_red*my_confusion_table_recall_train_class_red/(my_confusion_table_precision_train_class_red + my_confusion_table_recall_train_class_red))
paste(
  
  "precision_train_class_red = ", my_confusion_table_precision_train_class_red,
  "recall_train_class_red = ", my_confusion_table_recall_train_class_red,
      "F1_score_train_class_red = ",my_confusion_table_F1score_train_class_red
  )
## [1] "precision_train_class_red =  0.990838618745596 recall_train_class_red =  0.984593837535014 F1_score_train_class_red =  0.987706357569371"

Precision, recall, F1score, for model– class red testing

#confusion-table cells for the held-out fold, "red" as the positive class
TP_test_red<-my_confusion_table_test_class_red[1,1]
FN_test_red<-my_confusion_table_test_class_red[1,2]
FP_test_red<-my_confusion_table_test_class_red[2,1]
TN_test_red<-my_confusion_table_test_class_red[2,2] 

#precision = TP/(TP+FP); recall = TP/(TP+FN); F1 = harmonic mean of the two
my_confusion_table_precision_test_class_red <- TP_test_red/(TP_test_red + FP_test_red)  
my_confusion_table_recall_test_class_red <- TP_test_red / (TP_test_red + FN_test_red)
my_confusion_table_F1score_test_class_red <-2*(my_confusion_table_precision_test_class_red*my_confusion_table_recall_test_class_red/(my_confusion_table_precision_test_class_red + my_confusion_table_recall_test_class_red))
paste(
  
  "precision_test_class_red = ", my_confusion_table_precision_test_class_red,
  "recall_test_class_red = ", my_confusion_table_recall_test_class_red,
    "F1_score_test_class_red = ",my_confusion_table_F1score_test_class_red
  )
## [1] "precision_test_class_red =  0.988372093023256 recall_test_class_red =  0.994152046783626 F1_score_test_class_red =  0.991253644314869"

white wine confusion table–results on train case

#White-class confusion table: swap rows and columns of the red-class table so
#White becomes the positive class
my_confusion_table_train_class_White<-as.data.frame(rbind(c(my_confusion_table_train_class_red[2,2],
                               my_confusion_table_train_class_red[2,1]),
                          c(my_confusion_table_train_class_red[1,2],my_confusion_table_train_class_red[1,1])))
colnames(my_confusion_table_train_class_White)<-c("White","red")
rownames(my_confusion_table_train_class_White) <- c("White","red")

my_confusion_table_train_class_White

Precision, recall, F1score, for model– class white training

#confusion-table cells with "White" as the positive class
TP_train_White<-my_confusion_table_train_class_White[1,1]
FN_train_White<-my_confusion_table_train_class_White[1,2]
FP_train_White<-my_confusion_table_train_class_White[2,1]
TN_train_White<-my_confusion_table_train_class_White[2,2] 

#precision = TP/(TP+FP); recall = TP/(TP+FN); F1 = harmonic mean of the two
my_confusion_table_precision_train_class_White <- TP_train_White/(TP_train_White + FP_train_White)  
my_confusion_table_recall_train_class_White <- TP_train_White / (TP_train_White + FN_train_White)
my_confusion_table_F1score_train_class_White <- 2*(my_confusion_table_precision_train_class_White*my_confusion_table_recall_train_class_White/(my_confusion_table_precision_train_class_White + my_confusion_table_recall_train_class_White))

paste(
  "precision_train_class_White = ", my_confusion_table_precision_train_class_White,
  "recall_train_class_White = ", my_confusion_table_recall_train_class_White,
    "F1_score_train_class_White = ",my_confusion_table_F1score_train_class_White
)
## [1] "precision_train_class_White =  0.995031616982836 recall_train_class_White =  0.997058157954288 F1_score_train_class_White =  0.996043856674579"

white wine results on test case

Transpose the two-way class-red table to obtain the class-white table

#White-class confusion table for the test fold: swap rows and columns of the
#red-class table so White becomes the positive class
my_confusion_table_test_class_White<-as.data.frame(rbind(c(my_confusion_table_test_class_red[2,2],
                               my_confusion_table_test_class_red[2,1]),
                          c(my_confusion_table_test_class_red[1,2],my_confusion_table_test_class_red[1,1])))
colnames(my_confusion_table_test_class_White)<-c("White","red")
rownames(my_confusion_table_test_class_White) <- c("White","red")

my_confusion_table_test_class_White

Precision, recall, F1score, for model– class white testing

#confusion-table cells for the held-out fold, "White" as the positive class
TP_test_White<-my_confusion_table_test_class_White[1,1]
FN_test_White<-my_confusion_table_test_class_White[1,2]
FP_test_White<-my_confusion_table_test_class_White[2,1]
TN_test_White<-my_confusion_table_test_class_White[2,2] 

#precision = TP/(TP+FP); recall = TP/(TP+FN); F1 = harmonic mean of the two
my_confusion_table_precision_test_class_White <- TP_test_White/(TP_test_White + FP_test_White)  
my_confusion_table_recall_test_class_White <- TP_test_White / (TP_test_White + FN_test_White)
my_confusion_table_F1score_test_class_White<-2*(my_confusion_table_precision_test_class_White*my_confusion_table_recall_test_class_White/(my_confusion_table_precision_test_class_White + my_confusion_table_recall_test_class_White))
paste(
  "precision_test_class_White = ", my_confusion_table_precision_test_class_White,
  "recall_test_class_White = ", my_confusion_table_recall_test_class_White,
  "F1_score_test_class_White = ",my_confusion_table_F1score_test_class_White
)
## [1] "precision_test_class_White =  0.997907949790795 recall_test_class_White =  0.995824634655532 F1_score_test_class_White =  0.996865203761755"

Overall accuracy of the model on train case

#accuracy = (TP + TN) / total; symmetric, so identical for either class's table
my_confusion_table_accuracy_train_class_red <- (TP_train_red + TN_train_red) /(TP_train_red + TN_train_red + FP_train_red + FN_train_red)
my_confusion_table_accuracy_train_class_red#should be the same if based off the table for class White
## [1] 0.994014

Overall accuracy of the model on test case

#accuracy on the held-out fold; symmetric, so identical for either class's table
my_confusion_table_accuracy_test_class_red <- (TP_test_red + TN_test_red) /(TP_test_red + TN_test_red + FP_test_red + FN_test_red)
my_confusion_table_accuracy_test_class_red#should be the same if based off the table for class White
## [1] 0.9953846

ROC and AUC for White wine train case

library(ROCR)
#recode labels and predictions to "1" (White) / "0" (red) for ROCR
my_train_data_updated$`my_data$Wine_type`<-ifelse(my_train_data_updated$`my_data$Wine_type`=="White","1","0")
my_train_data_updated$predictions_logistic_model1<-ifelse(my_train_data_updated$predictions_logistic_model1=="White","1","0")
#NOTE(review): column ncol-2 is the recoded ACTUAL label and column ncol the
#hard 0/1 prediction, but ROCR::prediction() expects (predictions, labels) in
#that order; also, hard classes (rather than the fitted probabilities in
#column `1`) yield a single-operating-point ROC — confirm intent
pred_train<- prediction(as.matrix(as.numeric(my_train_data_updated[,(ncol(my_train_data_updated)-2)])), as.matrix(as.numeric(my_train_data_updated[,ncol(my_train_data_updated)])))
perf_train<-performance(pred_train, "tpr", "fpr")
plot(perf_train)

#AUC is stored in the S4 object's y.values slot
auc_train<-attr(performance(pred_train, "auc"),"y.values")
auc_train
## [[1]]
## [1] 0.9929351

ROC and AUC for White wine test case

library(ROCR)
#recode set1 labels and predictions to "1" (White) / "0" (red) for ROCR
set1_updated$`my_data$Wine_type`<-ifelse(set1_updated$`my_data$Wine_type`=="White","1","0")
set1_updated$predictions_logistic_model1<-ifelse(set1_updated$predictions_logistic_model1=="White","1","0")
#NOTE(review): as in the training ROC above, the first argument is the recoded
#actual label column (ncol-2) and the second the hard 0/1 prediction (ncol);
#prediction() expects (predictions, labels), and hard classes give a
#single-operating-point ROC — confirm intent
pred_test<- prediction(as.matrix(as.numeric(set1_updated[,(ncol(set1_updated)-2)])), as.matrix(as.numeric(set1_updated[,ncol(set1_updated)])))
perf_test<-performance(pred_test, "tpr", "fpr")
plot(perf_test)

#AUC is stored in the S4 object's y.values slot
auc_test<-attr(performance(pred_test, "auc"),"y.values")
auc_test
## [[1]]
## [1] 0.99314

Would there be a difference in our model if we used a lasso informed model without associated x coefs?

#outcome: last column of the training data (Wine_type)
y<-my_train_data[,ncol(my_train_data)]

#ordinary (unpenalized) logistic regression restricted to the predictors the
#lasso retained
#NOTE(review): referencing my_train_data$... inside the formula instead of
#passing data = my_train_data makes coefficient names verbose and the model
#awkward to use with predict() on new data
lasso_model2_no_coefs<-glm(formula =  y~ my_train_data$my_data_cat.4 +                
                                        my_train_data$pH_scaled +          
                                        my_train_data$fixed_acidity_scaled  +
                                        my_train_data$chlorides_scaled +         
                                        my_train_data$residual_sugar_scaled +   
                                        my_train_data$density_scaled    +           
                                        my_train_data$total_sulfur_dioxide_scaled  +
                                        my_train_data$volatile_acidity_scaled +
                                        my_train_data$alcohol_scaled  +   
                                        my_train_data$sulphates_scaled, 
                           family = "binomial")

summary(lasso_model2_no_coefs)
## 
## Call:
## glm(formula = y ~ my_train_data$my_data_cat.4 + my_train_data$pH_scaled + 
##     my_train_data$fixed_acidity_scaled + my_train_data$chlorides_scaled + 
##     my_train_data$residual_sugar_scaled + my_train_data$density_scaled + 
##     my_train_data$total_sulfur_dioxide_scaled + my_train_data$volatile_acidity_scaled + 
##     my_train_data$alcohol_scaled + my_train_data$sulphates_scaled, 
##     family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.3035   0.0017   0.0182   0.0563   6.9780  
## 
## Coefficients:
##                                           Estimate Std. Error z value
## (Intercept)                                 4.0628     0.2790  14.563
## my_train_data$my_data_cat.4                 2.4757     0.5828   4.248
## my_train_data$pH_scaled                     0.1028     0.2237   0.460
## my_train_data$fixed_acidity_scaled          0.5684     0.2941   1.933
## my_train_data$chlorides_scaled             -0.6789     0.1426  -4.761
## my_train_data$residual_sugar_scaled         4.0721     0.4528   8.994
## my_train_data$density_scaled               -5.0360     0.5585  -9.017
## my_train_data$total_sulfur_dioxide_scaled   2.3960     0.2127  11.264
## my_train_data$volatile_acidity_scaled      -1.3447     0.1622  -8.289
## my_train_data$alcohol_scaled               -2.0039     0.3249  -6.168
## my_train_data$sulphates_scaled             -0.4251     0.1808  -2.351
##                                           Pr(>|z|)    
## (Intercept)                                < 2e-16 ***
## my_train_data$my_data_cat.4               2.16e-05 ***
## my_train_data$pH_scaled                     0.6458    
## my_train_data$fixed_acidity_scaled          0.0533 .  
## my_train_data$chlorides_scaled            1.93e-06 ***
## my_train_data$residual_sugar_scaled        < 2e-16 ***
## my_train_data$density_scaled               < 2e-16 ***
## my_train_data$total_sulfur_dioxide_scaled  < 2e-16 ***
## my_train_data$volatile_acidity_scaled      < 2e-16 ***
## my_train_data$alcohol_scaled              6.91e-10 ***
## my_train_data$sulphates_scaled              0.0187 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6500.75  on 5846  degrees of freedom
## Residual deviance:  407.05  on 5836  degrees of freedom
## AIC: 429.05
## 
## Number of Fisher Scoring iterations: 9
#in-sample fitted probabilities (type = "response" is honoured by predict.glm)
lasso_model2_no_coefs_prediction<-predict(lasso_model2_no_coefs, type= "response")
lasso_model2_no_coefs_prediction_classes<-ifelse(lasso_model2_no_coefs_prediction>=.5,"White","red")
#training confusion matrix: rows = actual Wine_type, columns = predicted class
my_confusion_table_train_no_coefs_class_red <-table(cbind(as.data.frame(my_train_data$`my_data$Wine_type`), lasso_model2_no_coefs_prediction_classes))
my_confusion_table_train_no_coefs_class_red
##                                  lasso_model2_no_coefs_prediction_classes
## my_train_data$`my_data$Wine_type`  red White
##                             red   1412    16
##                             White   14  4405
#for comparison: the penalized (lasso) model's training confusion table
my_confusion_table_train_class_red
##        
##          red White
##   red   1406    22
##   White   13  4406

Precision, recall, F1 score for model with no lambda-selected coefs – red class training

#confusion-table cells for the unpenalized glm, "red" as the positive class
#(note: these overwrite the TP/FN/FP/TN variables from the lasso model)
TP_train_red<-my_confusion_table_train_no_coefs_class_red[1,1]
FN_train_red<-my_confusion_table_train_no_coefs_class_red[1,2]
FP_train_red<-my_confusion_table_train_no_coefs_class_red[2,1]
TN_train_red<-my_confusion_table_train_no_coefs_class_red[2,2] 

#precision = TP/(TP+FP); recall = TP/(TP+FN); F1 = harmonic mean of the two
my_confusion_table_precision_train_no_coefs_class_red <- TP_train_red/(TP_train_red + FP_train_red)  
my_confusion_table_recall_train_no_coefs_class_red <- TP_train_red / (TP_train_red + FN_train_red)
my_confusion_table_F1score_train_no_coefs_class_red<-2*(my_confusion_table_precision_train_no_coefs_class_red*my_confusion_table_recall_train_no_coefs_class_red/(my_confusion_table_precision_train_no_coefs_class_red + my_confusion_table_recall_train_no_coefs_class_red))
paste(
  
  "precision_train_no_coefs_class_red = ", my_confusion_table_precision_train_no_coefs_class_red,
  "recall_train_no_coefs_class_red = ", my_confusion_table_recall_train_no_coefs_class_red,
    "F1_score_train_no_coefs_class_red = ",my_confusion_table_F1score_train_no_coefs_class_red
)
## [1] "precision_train_no_coefs_class_red =  0.990182328190743 recall_train_no_coefs_class_red =  0.988795518207283 F1_score_train_no_coefs_class_red =  0.989488437281009"
#White-class confusion table for the unpenalized glm: swap rows and columns of
#the red-class table
my_confusion_table_train_no_coefs_class_White<-as.data.frame(rbind(c(my_confusion_table_train_no_coefs_class_red[2,2],
                               my_confusion_table_train_no_coefs_class_red[2,1]),
                          c(my_confusion_table_train_no_coefs_class_red[1,2],my_confusion_table_train_no_coefs_class_red[1,1])))
colnames(my_confusion_table_train_no_coefs_class_White)<-c("White","red")
rownames(my_confusion_table_train_no_coefs_class_White) <- c("White","red")

my_confusion_table_train_no_coefs_class_White

Precision, recall, F1 score for model with no lambda-selected coefs – white class training

#confusion-table cells for the unpenalized glm, "White" as the positive class
TP_train_White<-my_confusion_table_train_no_coefs_class_White[1,1]
FN_train_White<-my_confusion_table_train_no_coefs_class_White[1,2]
FP_train_White<-my_confusion_table_train_no_coefs_class_White[2,1]
TN_train_White<-my_confusion_table_train_no_coefs_class_White[2,2] 

#precision = TP/(TP+FP); recall = TP/(TP+FN); F1 = harmonic mean of the two
my_confusion_table_precision_train_no_coefs_class_White <- TP_train_White/(TP_train_White + FP_train_White)  
my_confusion_table_recall_train_no_coefs_class_White <- TP_train_White / (TP_train_White + FN_train_White)
my_confusion_table_F1score_train_no_coefs_class_White<-2*(my_confusion_table_precision_train_no_coefs_class_White*my_confusion_table_recall_train_no_coefs_class_White/(my_confusion_table_precision_train_no_coefs_class_White + my_confusion_table_recall_train_no_coefs_class_White))

paste(
  "precision_train_no_coefs_class_White = ", my_confusion_table_precision_train_no_coefs_class_White,
  "recall_train_no_coefs_class_White = ", my_confusion_table_recall_train_no_coefs_class_White,
  "F1_score_train_no_coefs_class_White = ",my_confusion_table_F1score_train_no_coefs_class_White
)
## [1] "precision_train_no_coefs_class_White =  0.996380909296539 recall_train_no_coefs_class_White =  0.99683186241231 F1_score_train_no_coefs_class_White =  0.996606334841629"

SVM Models for Consideration

SVM– with radial kernel

library(e1071)
set.seed(1)
#radial-kernel SVM with 10-fold CV on columns 2:19
#NOTE(review): column 19 of my_train_data_updated is the quality factor that
#was added back as a label earlier, so quality is being used as a predictor of
#wine type here — confirm that is intended; y is column 20 (Wine_type,
#recoded "0"/"1" by the ROC chunk above, matching the Levels shown below)
m.svm_radial_train<- svm(my_train_data_updated[,2:19], as.factor(my_train_data_updated[,20]), cross = 10, scale = FALSE, kernel = "radial", probability = TRUE)
summary(m.svm_radial_train)
## 
## Call:
## svm.default(x = my_train_data_updated[, 2:19], y = as.factor(my_train_data_updated[, 
##     20]), scale = FALSE, kernel = "radial", cross = 10, probability = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.05555556 
## 
## Number of Support Vectors:  272
## 
##  ( 145 127 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
## 
## 10-fold cross-validation on training data:
## 
## Total Accuracy: 99.60664 
## Single Accuracies:
##  99.14384 99.65812 99.65812 99.82877 99.65812 99.82906 99.4863 99.48718 99.82906 99.48718

SVM with linear kernel

#linear-kernel SVM, same predictors (cols 2:19) and label (col 20) as the
#radial fit above
m.svm_linear_train<- svm(my_train_data_updated[,2:19], as.factor(my_train_data_updated[,20]), cross = 10, scale = FALSE, kernel = "linear", probability = TRUE)
summary(m.svm_linear_train)
## 
## Call:
## svm.default(x = my_train_data_updated[, 2:19], y = as.factor(my_train_data_updated[, 
##     20]), scale = FALSE, kernel = "linear", cross = 10, probability = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.05555556 
## 
## Number of Support Vectors:  142
## 
##  ( 73 69 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
## 
## 10-fold cross-validation on training data:
## 
## Total Accuracy: 99.46981 
## Single Accuracies:
##  99.14384 99.82906 99.1453 99.65753 99.82906 99.31624 99.31507 99.1453 99.31624 100

Gather ensemble of logistic, svm radial, svm linear

#convert the SVMs' fitted "0"/"1" factor classes to integers
m.svm_linear_train_fitted<-as.data.frame(as.integer(as.character(m.svm_linear_train$fitted)))
m.svm_radial_train_fitted<-as.data.frame(as.integer(as.character(m.svm_radial_train$fitted)))
my_train_data_updated$predictions_logistic_model1<-as.integer(as.character(my_train_data_updated$predictions_logistic_model1))
my_train_data_updated<-cbind(my_train_data_updated,m.svm_linear_train_fitted, m.svm_radial_train_fitted)

#majority vote across the three 0/1 predictions: at least 2 of 3 votes wins
my_train_data_updated$majority_vote<-my_train_data_updated$predictions_logistic_model1+
                                              my_train_data_updated$`as.integer(as.character(m.svm_radial_train$fitted))`+
                                              my_train_data_updated$`as.integer(as.character(m.svm_linear_train$fitted))`
my_train_data_updated$majority_vote<-ifelse(my_train_data_updated$majority_vote>=2,1,0)

#ensemble confusion matrix (labels are the 0/1 recoded Wine_type)
my_confusion_table_train_ensemble1_class_red<-table(my_train_data_updated$`my_data$Wine_type`, my_train_data_updated$majority_vote)
my_confusion_table_train_ensemble1_class_red
##    
##        0    1
##   0 1411   17
##   1    9 4410

Precision, recall, F1 score for ensemble (logistic, svm-linear, svm-radial, majority vote) model – red class training

#ensemble confusion-table cells, red (0) as the positive class
#(note: these overwrite the TP/FN/FP/TN variables used by earlier models)
TP_train_red<-my_confusion_table_train_ensemble1_class_red[1,1]
FN_train_red<-my_confusion_table_train_ensemble1_class_red[1,2]
FP_train_red<-my_confusion_table_train_ensemble1_class_red[2,1]
TN_train_red<-my_confusion_table_train_ensemble1_class_red[2,2] 

#precision = TP/(TP+FP); recall = TP/(TP+FN); F1 = harmonic mean of the two
my_confusion_table_precision_train_ensemble1_class_red <- TP_train_red/(TP_train_red + FP_train_red)  
my_confusion_table_recall_train_ensemble1_class_red <- TP_train_red / (TP_train_red + FN_train_red)
my_confusion_table_F1score_train_ensemble1_class_red<-2*(my_confusion_table_precision_train_ensemble1_class_red*my_confusion_table_recall_train_ensemble1_class_red/(my_confusion_table_precision_train_ensemble1_class_red + my_confusion_table_recall_train_ensemble1_class_red))
paste(
  
  "precision_train_ensemble1_class_red = ", my_confusion_table_precision_train_ensemble1_class_red,
  "recall_train_ensemble1_class_red = ", my_confusion_table_recall_train_ensemble1_class_red,
    "F1_score_train_ensemble1_class_red = ",my_confusion_table_F1score_train_ensemble1_class_red
)
## [1] "precision_train_ensemble1_class_red =  0.993661971830986 recall_train_ensemble1_class_red =  0.988095238095238 F1_score_train_ensemble1_class_red =  0.990870786516854"

Precision, recall, F1 score for ensemble (logistic, svm-linear, svm-radial, majority vote) model – white class training

#White-class ensemble confusion table: swap rows and columns of the red-class
#table so White becomes the positive class
my_confusion_table_train_ensemble1_class_White<-as.data.frame(rbind(c(my_confusion_table_train_ensemble1_class_red[2,2],
                               my_confusion_table_train_ensemble1_class_red[2,1]),
                          c(my_confusion_table_train_ensemble1_class_red[1,2],my_confusion_table_train_ensemble1_class_red[1,1])))
colnames(my_confusion_table_train_ensemble1_class_White)<-c("White","red")
rownames(my_confusion_table_train_ensemble1_class_White) <- c("White","red")

my_confusion_table_train_ensemble1_class_White
#confusion-table cells with "White" as the positive class
TP_train_White<-my_confusion_table_train_ensemble1_class_White[1,1]
FN_train_White<-my_confusion_table_train_ensemble1_class_White[1,2]
FP_train_White<-my_confusion_table_train_ensemble1_class_White[2,1]
TN_train_White<-my_confusion_table_train_ensemble1_class_White[2,2]

my_confusion_table_precision_train_ensemble1_class_White <- TP_train_White/(TP_train_White + FP_train_White)
my_confusion_table_recall_train_ensemble1_class_White<- TP_train_White / (TP_train_White + FN_train_White)
#BUG FIX: the F1 numerator previously used the RED-class precision
#(..._class_red) alongside the White-class recall, giving an inconsistent
#score; F1 must be the harmonic mean of the White-class precision and recall
my_confusion_table_F1score_train_ensemble1_class_White<-2*(my_confusion_table_precision_train_ensemble1_class_White*my_confusion_table_recall_train_ensemble1_class_White/(my_confusion_table_precision_train_ensemble1_class_White + my_confusion_table_recall_train_ensemble1_class_White))
paste(
  
  "precision_train_ensemble1_class_White = ", my_confusion_table_precision_train_ensemble1_class_White,
  "recall_train_ensemble1_class_White = ", my_confusion_table_recall_train_ensemble1_class_White,
    "F1_score_train_ensemble1_class_White = ",my_confusion_table_F1score_train_ensemble1_class_White
)
## [1] "precision_train_ensemble1_class_White =  0.996159927716286 recall_train_ensemble1_class_White =  0.9979633401222 F1_score_train_ensemble1_class_White =  0.994560603503453"

Which cases did the SVM-linear and SVM-radial disagree on? Which was correct? (From the training set)

#rows where the two SVMs' fitted classes differ; inspect the label column to
#see which model was correct on each case
my_train_data_updated[which(my_train_data_updated$`as.integer(as.character(m.svm_linear_train$fitted))`!=my_train_data_updated$`as.integer(as.character(m.svm_radial_train$fitted))`),]